#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr) ; library(kableExtra)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(WGCNA)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd) and clustering (pipeline in 05_WGCNA.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Clusterings
clusterings = read_csv('./../Data/clusters.csv')
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)
genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))
clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
load('./../Data/GSEA.RData')
GSEA_old_SFARI = enrichment_old_SFARI
GSEA_SFARI = enrichment_SFARI
load('./../Data/ORA.RData')
ORA_old_SFARI = enrichment_old_SFARI
ORA_SFARI = enrichment_SFARI
rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, enrichment_DGN, enrichment_DO, enrichment_GO,
enrichment_KEGG, enrichment_Reactome, enrichment_SFARI)
We have results both from GSEA and ORA to measure the enrichment of SFARI Genes in each module, and they both agree with each other relatively well
SFARI_genes_by_module = c()
for(module in names(GSEA_old_SFARI)){
GSEA_info = GSEA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust, NES) %>%
mutate(pvalue = ifelse(NES>0, pvalue, 1-pvalue),
p.adjust = ifelse(NES>0, p.adjust, 1)) %>%
dplyr::rename('GSEA_pval' = pvalue, 'GSEA_padj'= p.adjust)
ORA_info = ORA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust, qvalue, GeneRatio, Count) %>%
dplyr::rename('ORA_pval' = pvalue, 'ORA_padj' = p.adjust)
module_info = GSEA_info %>% full_join(ORA_info, by = 'ID') %>% add_column(.before = 'ID', Module = module)
SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}
SFARI_genes_by_module = SFARI_genes_by_module %>%
left_join(dataset %>% dplyr::select(Module, MTcor) %>%
group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module') %>%
mutate(ORA_pval = ifelse(is.na(ORA_pval), 1, ORA_pval),
ORA_padj = ifelse(is.na(ORA_padj), 1, ORA_padj))
plot_data = SFARI_genes_by_module %>% filter(ID=='SFARI')
ggplotly(plot_data %>% ggplot(aes(1-GSEA_pval, 1-ORA_pval, size = n)) +
geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) +
geom_smooth(se=FALSE, color = '#CCCCCC') +
xlab('GSEA Enrichment') + ylab('ORA Enrichment') + coord_fixed() +
ggtitle(paste0('Corr = ', round(cor(plot_data$GSEA_pval, plot_data$ORA_pval),2))) +
theme_minimal() + theme(legend.position = 'none'))
To determine which modules have a statistically significant enrichment in SFARI Genes we can use the adjusted p-values. We used the Bonferroni correction for this.
GSEA identifies 20/55 as significant. This doesn’t make sense
ggplotly(plot_data %>% ggplot(aes(GSEA_padj, ORA_padj, size = n)) +
geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) +
geom_smooth(se=FALSE, color = '#CCCCCC') +
geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dashed') +
geom_vline(xintercept = 0.01, color = 'gray', linetype = 'dashed') +
xlab('GSEA adjusted p-value') + ylab('ORA adjusted p-value') +
scale_x_log10(limits = c(min(plot_data$GSEA_padj, plot_data$ORA_padj),1.2)) +
scale_y_log10(limits = c(min(plot_data$GSEA_padj, plot_data$ORA_padj),1.2)) +
ggtitle(paste0('Corr = ',round(cor(plot_data$GSEA_padj, plot_data$ORA_padj),2))) + coord_fixed() +
theme_minimal() + theme(legend.position = 'none'))
plot_data = plot_data %>% mutate(GSEA_sig = GSEA_padj<0.01, ORA_sig = ORA_padj<0.01) %>%
apply_labels(GSEA_sig = 'GSEA significant enrichment',
ORA_sig = 'ORA significant enrichment')
cro(plot_data$GSEA_sig, list(plot_data$ORA_sig, total()))
| ORA significant enrichment | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| GSEA significant enrichment | ||||
| FALSE | 35 | 35 | ||
| TRUE | 17 | 3 | 20 | |
| #Total cases | 52 | 3 | 55 | |
The ‘over-enrichment’ in SFARI Modules in GSEA could be because SFARI Genes have in general higher Module Memberships than the other genes, which would make them cluster at the beginning of the list constantly and would bias the enrichment analysis.
Looking at the plot below, we can see that there is not a uniform distribution of SFARI genes across all quantiles of the Module Membership values, but they instead seem to cluster around Module Membership values with high magnitudes (both positive and negative), so I don’t think the GSEA results for the SFARI genes are valid.
Because of this, I’m going to use the enrichment from the ORA to study the SFARI Genes
quant_data = dataset %>% dplyr::select(ID, contains('MM.')) %>%
left_join(genes_info %>% dplyr::select(ID, gene.score), by = 'ID') %>% dplyr::select(-ID) %>%
melt %>% mutate(quant = cut(value, breaks = quantile(value, probs = seq(0,1,0.05)) %>%
as.vector, labels = FALSE)) %>%
group_by(gene.score, quant) %>% tally %>% ungroup %>% ungroup
quant_data = quant_data %>% group_by(quant) %>% summarise(N = sum(n)) %>% ungroup %>%
left_join(quant_data, by = 'quant') %>% dplyr::select(quant, gene.score, n, N) %>%
mutate(p = round(100*n/N,2)) %>% filter(!is.na(quant)) %>%
mutate(gene.score = factor(gene.score, levels=rev(c('1','2','3','4','5','6','Neuronal','Others'))))
ggplotly(quant_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>%
ggplot(aes(quant, p, fill = gene.score)) + geom_bar(stat='identity') +
xlab('Module Membership Quantiles') + ylab('% of SFARI Genes in Quantile') +
ggtitle('Percentage of Genes labelled as SFARI in each Quantile') +
scale_fill_manual(values = SFARI_colour_hue(r=rev(c(1:6)))) +
theme_minimal() + theme(legend.position = 'none'))
rm(quant_data)
Selecting modules with an adjusted p-value below 0.01 using the ORA
ggplotly(plot_data %>% ggplot(aes(MTcor, ORA_padj, size=n)) +
geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) +
geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dotted') +
xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
ggtitle('Modules Significantly Enriched in SFARI Genes') +
theme_minimal() + theme(legend.position = 'none'))
top_modules = plot_data %>% arrange(desc(ORA_padj)) %>% filter(ORA_padj<0.01) %>% pull(Module) %>% as.character
plot_data %>% filter(Module %in% top_modules) %>% arrange(ORA_pval) %>%
dplyr::select(Module, MTcor, ORA_pval, ORA_padj, qvalue, GeneRatio, Count) %>%
rename( ORA_pval = 'p-value', ORA_padj = 'Adjusted p-value') %>%
kable %>% kable_styling(full_width = F)
| Module | MTcor | p-value | Adjusted p-value | qvalue | GeneRatio | Count |
|---|---|---|---|---|---|---|
| #96A900 | 0.0721036 | 5.30e-06 | 0.0000368 | 0.0000119 | 26/173 | 26 |
| #00C0BF | -0.5573039 | 1.83e-05 | 0.0001465 | 0.0000964 | 39/336 | 39 |
| #44A0FF | 0.6175450 | 5.48e-05 | 0.0003839 | 0.0001732 | 56/574 | 56 |
We lose Module #00BADE, which was significantly enriched with the new SFARI genes. The other three modules were also found to be significant in the new SFARI Genes, so details about these modules can be found in 08_to_SFARI_modules.html (I’m not going to repeat them here since there’s no new information)
In general, there isn’t a big change in SFARI enrichment between the two versions of the SFARI Genes
SFARI_genes_by_module = c()
for(module in names(ORA_old_SFARI)){
ORA_old_info = ORA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust) %>%
dplyr::rename('pval_old_SFARI' = pvalue, 'padj_old_SFARI' = p.adjust)
ORA_info = ORA_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust) %>%
dplyr::rename('pval_SFARI' = pvalue, 'padj_SFARI' = p.adjust)
module_info = ORA_old_info %>% full_join(ORA_info, by = 'ID') %>% add_column(.before = 'ID', Module = module)
SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}
SFARI_genes_by_module = SFARI_genes_by_module %>%
left_join(dataset %>% dplyr::select(Module, MTcor) %>%
group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module') %>%
mutate(pval_old_SFARI = ifelse(is.na(pval_old_SFARI), 1, pval_old_SFARI),
pval_SFARI = ifelse(is.na(pval_SFARI), 1, pval_SFARI),
padj_old_SFARI = ifelse(is.na(padj_old_SFARI), 1, padj_old_SFARI),
padj_SFARI = ifelse(is.na(padj_SFARI), 1, padj_SFARI))
plot_data = SFARI_genes_by_module %>% filter(ID=='SFARI')
ggplotly(plot_data %>% ggplot(aes(1-pval_old_SFARI, 1-pval_SFARI, size = n)) +
geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) +
geom_abline(slope = 1, intercept = 0, color = '#CCCCCC', linetype = 'dotted') +
xlab('Enrichment Old SFARI Genes') + ylab('Enrichment New SFARI Genes') + coord_fixed() +
ggtitle(paste0('Corr = ', round(cor(plot_data$pval_old_SFARI, plot_data$pval_SFARI),2))) +
theme_minimal() + theme(legend.position = 'none'))
Seems like we missed Module #00BADE from being significant by a really small margin, but it was close to being significant in both groups of genes
It seems like we almost gained two Modules, #FF63B6 and #00C08C, but they didn’t reach the threshold
The three shared significant modules have very low p-values in both groups of genes, nowhere near the threshold
ggplotly(plot_data %>% ggplot(aes(padj_old_SFARI, padj_SFARI, size = n)) +
geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) +
geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dashed') +
geom_vline(xintercept = 0.01, color = 'gray', linetype = 'dashed') +
geom_abline(slope = 1, intercept = 0, color = '#CCCCCC', linetype = 'dotted') +
xlab('Adjusted p-value Old SFARI Genes') + ylab('Adjusted p-value New SFARI Genes') + coord_fixed() +
scale_x_log10(limits = c(min(plot_data$padj_old_SFARI, plot_data$padj_SFARI),1.2)) +
scale_y_log10(limits = c(min(plot_data$padj_old_SFARI, plot_data$padj_SFARI),1.2)) +
ggtitle(paste0('Corr = ', round(cor(plot_data$padj_old_SFARI, plot_data$padj_SFARI),2))) +
theme_minimal() + theme(legend.position = 'none'))
Enrichment doesn’t change much between SFARI Gene versions, sharing 3/4 statistically enriched modules and losing one just by a few points in its adjusted p-value, so the conclusions that we drew from the new SFARI Genes can be extended to the old SFARI Genes as well
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] WGCNA_1.69 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] org.Hs.eg.db_3.8.2 AnnotationDbi_1.46.1 IRanges_2.18.3
## [7] S4Vectors_0.22.1 Biobase_2.44.0 BiocGenerics_0.30.0
## [10] DOSE_3.10.2 ReactomePA_1.28.0 clusterProfiler_3.12.0
## [13] biomaRt_2.40.5 kableExtra_1.1.0 knitr_1.28
## [16] doParallel_1.0.15 iterators_1.0.12 foreach_1.5.0
## [19] polycor_0.7-10 expss_0.10.2 ggpubr_0.2.5
## [22] magrittr_1.5 GGally_1.5.0 gridExtra_2.3
## [25] viridis_0.5.1 viridisLite_0.3.0 RColorBrewer_1.1-2
## [28] dendextend_1.13.4 plotly_4.9.2 glue_1.4.1
## [31] reshape2_1.4.4 forcats_0.5.0 stringr_1.4.0
## [34] dplyr_1.0.0 purrr_0.3.4 readr_1.3.1
## [37] tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.2
## [40] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 RSQLite_2.2.0
## [3] htmlwidgets_1.5.1 grid_3.6.3
## [5] BiocParallel_1.18.1 munsell_0.5.0
## [7] codetools_0.2-16 preprocessCore_1.46.0
## [9] withr_2.2.0 colorspace_1.4-1
## [11] GOSemSim_2.10.0 highr_0.8
## [13] rstudioapi_0.11 ggsignif_0.6.0
## [15] labeling_0.3 urltools_1.7.3
## [17] GenomeInfoDbData_1.2.1 polyclip_1.10-0
## [19] bit64_0.9-7 farver_2.0.3
## [21] vctrs_0.3.1 generics_0.0.2
## [23] xfun_0.12 R6_2.4.1
## [25] GenomeInfoDb_1.20.0 graphlayouts_0.7.0
## [27] locfit_1.5-9.4 DelayedArray_0.10.0
## [29] bitops_1.0-6 reshape_0.8.8
## [31] fgsea_1.10.1 gridGraphics_0.5-0
## [33] assertthat_0.2.1 scales_1.1.1
## [35] ggraph_2.0.3 nnet_7.3-14
## [37] enrichplot_1.4.0 gtable_0.3.0
## [39] tidygraph_1.2.0 rlang_0.4.6
## [41] genefilter_1.66.0 splines_3.6.3
## [43] lazyeval_0.2.2 acepack_1.4.1
## [45] impute_1.58.0 broom_0.5.5
## [47] europepmc_0.4 checkmate_2.0.0
## [49] BiocManager_1.30.10 yaml_2.2.1
## [51] modelr_0.1.6 crosstalk_1.1.0.1
## [53] backports_1.1.8 qvalue_2.16.0
## [55] Hmisc_4.4-0 tools_3.6.3
## [57] ggplotify_0.0.5 ellipsis_0.3.1
## [59] ggridges_0.5.2 Rcpp_1.0.4.6
## [61] plyr_1.8.6 base64enc_0.1-3
## [63] progress_1.2.2 zlibbioc_1.30.0
## [65] RCurl_1.98-1.2 prettyunits_1.1.1
## [67] rpart_4.1-15 cowplot_1.0.0
## [69] SummarizedExperiment_1.14.1 haven_2.2.0
## [71] ggrepel_0.8.2 cluster_2.1.0
## [73] fs_1.4.0 data.table_1.12.8
## [75] DO.db_2.9 triebeard_0.3.0
## [77] reprex_0.3.0 reactome.db_1.68.0
## [79] matrixStats_0.56.0 xtable_1.8-4
## [81] hms_0.5.3 evaluate_0.14
## [83] XML_3.99-0.3 jpeg_0.1-8.1
## [85] readxl_1.3.1 compiler_3.6.3
## [87] crayon_1.3.4 htmltools_0.4.0
## [89] mgcv_1.8-31 Formula_1.2-3
## [91] geneplotter_1.62.0 lubridate_1.7.4
## [93] DBI_1.1.0 tweenr_1.0.1
## [95] dbplyr_1.4.2 MASS_7.3-51.6
## [97] rappdirs_0.3.1 Matrix_1.2-18
## [99] cli_2.0.2 igraph_1.2.5
## [101] GenomicRanges_1.36.1 pkgconfig_2.0.3
## [103] rvcheck_0.1.8 foreign_0.8-76
## [105] xml2_1.2.5 annotate_1.62.0
## [107] webshot_0.5.2 XVector_0.24.0
## [109] rvest_0.3.5 digest_0.6.25
## [111] graph_1.62.0 rmarkdown_2.1
## [113] cellranger_1.1.0 fastmatch_1.1-0
## [115] htmlTable_1.13.3 curl_4.3
## [117] graphite_1.30.0 lifecycle_0.2.0
## [119] nlme_3.1-147 jsonlite_1.7.0
## [121] fansi_0.4.1 pillar_1.4.4
## [123] lattice_0.20-41 httr_1.4.1
## [125] survival_3.1-12 GO.db_3.8.2
## [127] UpSetR_1.4.0 png_0.1-7
## [129] bit_1.1-15.2 ggforce_0.3.1
## [131] stringi_1.4.6 blob_1.2.1
## [133] DESeq2_1.24.0 latticeExtra_0.6-29
## [135] memoise_1.1.0